library(rtracklayer)
library(tidyverse)
library(ggforce)
library(GenomicRanges)
library(reshape2)
library(RColorBrewer)

Pool different time 0 replicates

# Sort bed file by chr and start position
cat A.bed B.bed .. | sort -k1,1 -k2,2n | mergeBed > AB.bed

# Merge the bed files (they need to be sorted by chr first and then start position) 
# Take into account the strand (-s), merge only overlapping position (-d -1)
# Sum the score and report the strand (-c 5,6)
bedtools merge -i AFKS_head_sorted.bed -s -d -1 -c 5,6 -o sum,distinct

# One-line command
cat *.bed | sort -k1,1 -k2,2n | bedtools merge -s -d -1 -c 5,6 -o sum,distinct > merged_ctss_timepoint_0.bed
# Load data
ATAC_granges <- import("../Data/ATAC_idr.optimal_peak.narrowPeak.gz", format="gz")
CAGE_granges <- import("../Data/ctss_files/merged_ctss_timepoint_0.bed", format="bed")
genome(ATAC_granges) <- "hg38"
genome(CAGE_granges) <- "hg38"
ATAC_granges
## GRanges object with 258265 ranges and 6 metadata columns:
##            seqnames              ranges strand |        name     score
##               <Rle>           <IRanges>  <Rle> | <character> <numeric>
##        [1]    chr15   89087912-89088828      * |        <NA>      1000
##        [2]     chr1 206202745-206204098      * |        <NA>      1000
##        [3]    chr11     6234200-6235551      * |        <NA>      1000
##        [4]    chr11     3797056-3798236      * |        <NA>      1000
##        [5]     chr8   25458212-25459500      * |        <NA>      1000
##        ...      ...                 ...    ... .         ...       ...
##   [258261]    chr10 106089928-106090175      * |        <NA>       200
##   [258262]    chr10 104639508-104639928      * |        <NA>       200
##   [258263]    chr10 104638719-104639064      * |        <NA>       200
##   [258264]    chr10 103131426-103131736      * |        <NA>       200
##   [258265]    chr10 101874851-101875892      * |        <NA>       357
##            signalValue    pValue    qValue      peak
##              <numeric> <numeric> <numeric> <integer>
##        [1]     22.9179   3542.55   3536.38       348
##        [2]     15.9319   3417.05   3410.94       464
##        [3]     17.5065   3325.87   3319.80       718
##        [4]     20.1593   3283.13   3277.08       639
##        [5]     18.0436   3234.07   3228.04       389
##        ...         ...       ...       ...       ...
##   [258261]     2.09155   3.82812   2.18792        93
##   [258262]     2.09155   3.82812   2.18792       390
##   [258263]     2.09155   3.82812   2.18792       237
##   [258264]     2.09155   3.82812   2.18792       207
##   [258265]     2.09155   3.82812   2.18792       177
##   -------
##   seqinfo: 24 sequences from hg38 genome; no seqlengths
CAGE_granges
## GRanges object with 4142444 ranges and 2 metadata columns:
##             seqnames    ranges strand |        name     score
##                <Rle> <IRanges>  <Rle> | <character> <numeric>
##         [1]     chr1     17494      - |           -         2
##         [2]     chr1     17496      - |           -         2
##         [3]     chr1    139023      + |           +         1
##         [4]     chr1    139289      + |           +         1
##         [5]     chr1    184489      - |           -         1
##         ...      ...       ...    ... .         ...       ...
##   [4142440]     chrY  56871054      - |           -         1
##   [4142441]     chrY  56871169      - |           -         2
##   [4142442]     chrY  56874150      + |           +         1
##   [4142443]     chrY  56874573      - |           -         1
##   [4142444]     chrY  56881313      - |           -         1
##   -------
##   seqinfo: 24 sequences from hg38 genome; no seqlengths

Extract the positive set

# Extend ATAC peaks 250 bp in both directions
ATAC_granges_peaks <- GRanges(seqnames = seqnames(ATAC_granges),
                              ranges = IRanges(start = start(ATAC_granges) + ATAC_granges$peak, width = 1),
                              strand = strand(ATAC_granges))
genome(ATAC_granges_peaks) <- "hg38"

start(ATAC_granges_peaks) <- start(ATAC_granges_peaks) - 250
end(ATAC_granges_peaks) <- end(ATAC_granges_peaks) + 250
ranges(ATAC_granges_peaks)
## IRanges object with 258265 ranges and 0 metadata columns:
##                start       end     width
##            <integer> <integer> <integer>
##        [1]  89088010  89088510       501
##        [2] 206202959 206203459       501
##        [3]   6234668   6235168       501
##        [4]   3797445   3797945       501
##        [5]  25458351  25458851       501
##        ...       ...       ...       ...
##   [258261] 106089771 106090271       501
##   [258262] 104639648 104640148       501
##   [258263] 104638706 104639206       501
##   [258264] 103131383 103131883       501
##   [258265] 101874778 101875278       501
ATAC_granges_peaks
## GRanges object with 258265 ranges and 0 metadata columns:
##            seqnames              ranges strand
##               <Rle>           <IRanges>  <Rle>
##        [1]    chr15   89088010-89088510      *
##        [2]     chr1 206202959-206203459      *
##        [3]    chr11     6234668-6235168      *
##        [4]    chr11     3797445-3797945      *
##        [5]     chr8   25458351-25458851      *
##        ...      ...                 ...    ...
##   [258261]    chr10 106089771-106090271      *
##   [258262]    chr10 104639648-104640148      *
##   [258263]    chr10 104638706-104639206      *
##   [258264]    chr10 103131383-103131883      *
##   [258265]    chr10 101874778-101875278      *
##   -------
##   seqinfo: 24 sequences from hg38 genome; no seqlengths

Remove blacklist overlapping regions

# Load blacklist and remove overlapping ATAC regions
blacklist_new <- import("../Data/hg38.blacklist_new.bed", format="bed")

overlapping_blacklist <- queryHits(findOverlaps(ATAC_granges_peaks, blacklist_new))
paste("There are", length(overlapping_blacklist), "ATAC ranges overlapping the blacklist")
## [1] "There are 3 ATAC ranges overlapping the blacklist"
ATAC_granges_peaks <- ATAC_granges_peaks[-overlapping_blacklist]

Extract the windows profile

# Report time execution
report_time_execution <- function(fun){
  start_time <- Sys.time()
  output <- fun
  print(Sys.time() - start_time)
return(output)
}

# Return score if the position is present, 0 otherwise
get_count_vector <- function(pos, df){
  if (pos %in% df$atac_relative_pos) {
  return(df$score[which(df$atac_relative_pos == pos)])
} else {return(0)}
}

# Return score vector for each position for both strands
get_count_vector_both_strands <- function(df){
  plus_count_vector <- sapply(1:501, get_count_vector, df = df[df$strand == "+",])
  minus_count_vector <- sapply(1:501, get_count_vector, df = df[df$strand == "-",])
  return(c(plus_count_vector, minus_count_vector))
}
  
# Return the CAGE profiles of the (CAGE) overlapping ATAC regions
get_chr_windows_profiles <- function(cage_granges, atac_granges, chr){
  # Select chromosome
  cage_granges <- cage_granges[seqnames(cage_granges) == chr]
  atac_granges <- atac_granges[seqnames(atac_granges) == chr]
  # Add all information into one df
  overlaps <- findOverlaps(cage_granges, atac_granges)                   # Index of overlapping CAGE fragment
  # Check if there are ATAC positive windows overlapping CAGE data
  if (length(overlaps) > 0){                                                   
    df <- cage_granges[queryHits(overlaps)]                              # Keep only overlapping CAGE data
    df$index_overlapping_atac <- subjectHits(overlaps)                   # Add index of overlapping ATAC   
    df %>% as_tibble() %>%                                               # Add ATAC start site and relative position
      mutate(atac_start = start(atac_granges[subjectHits(overlaps)]),
             atac_relative_pos = start - atac_start + 1) -> df
    # Extract profiles of each (CAGE) overlapping ATAC region
    profiles <- by(data = df, 
                   INDICES = df$index_overlapping_atac, 
                   FUN = function(x){get_count_vector_both_strands(x)})
    profiles <- data.frame(do.call(rbind, profiles))
    colnames(profiles) <- c(paste("Plus_", 1:501, sep = ""), paste("Minus_", 1:501, sep = ""))
    # Add metadata information
    profiles <- profiles %>% mutate(atac_start = start(atac_granges[as.numeric(rownames(profiles))]),
                                    chr = chr) %>% relocate(c(chr, atac_start), .before = Plus_1) 
    return(profiles)
  } 
  else {
    print(paste(chr, "contains no overlapping windows"))
    return(NULL)
  }
}

# Return the windows profiles for all chromosomes
get_windows_profiles <- function(cage_granges, atac_granges){
  chromosomes <- unique(seqnames(cage_granges))
  list_chr_profiles <- lapply(chromosomes, function(x) 
        {get_chr_windows_profiles(cage_granges = cage_granges, 
         atac_granges = atac_granges,
         chr = x)})
  output_list <- list()
  output_list$profiles <- data.frame(do.call(rbind, list_chr_profiles))  
  output_list$metadata <- output_list$profiles %>% select(chr, atac_start)
  output_list$profiles <- output_list$profiles %>% select(-chr, -atac_start)
  return(output_list)
}

atac_windows <- report_time_execution(get_windows_profiles(CAGE_granges, 
                                                           ATAC_granges_peaks))
## Time difference of 22.06792 mins
rows = nrow(atac_windows$metadata)
paste("Extracted profiles:", rows)
## [1] "Extracted profiles: 138277"
krows <- round(rows/1000)

Rank approach to remove intra overlapping ATAC

## Compute some measure of the rank and use it to remove the overlaps: total cage score (1), max TSS score (2)

# The following function take as input the indexes of overlapping ranges and 
# it returns the index of the ranges with largest total score
get_index_largest_score <- function(ranges_indexes, metadata){
  clash_total_score <- sapply(ranges_indexes, function(x) 
    metadata$total_score[x])
  return(ranges_indexes[which.max(clash_total_score)])
} 

# The following function remove the overlaps keeping the range with highest rank (large score)
# It takes the output of findoverlaps and for each range it return the index of the overlapping 
# range with largest score

remove_overlaps_by_rank <- function(windows){
  # Add measures of rank
  metadata_granges <- GRanges(seqnames = windows$metadata$chr, 
                              ranges = IRanges(start = windows$metadata$atac_start, width = 501))
  windows$metadata <- windows$metadata %>% mutate(total_score = apply(windows$profiles, 1, sum),
                                                  max_score = apply(windows$profiles, 1, max))
  # Get index largest score by overlaps
  overlaps <- as.data.frame(findOverlaps(metadata_granges))
  top_rank_index <- by(overlaps, INDICES = overlaps$queryHits, 
                 function(x){
                   get_index_largest_score(x$subjectHits, windows$metadata)})
  top_rank_index <- unique(do.call(cbind, list(top_rank_index)))
  # Keep only ranges with highest rank 
  windows$profiles <- windows$profiles[top_rank_index,]
  windows$metadata <- windows$metadata[top_rank_index,]
  return(windows)
}

pos_windows <- report_time_execution(remove_overlaps_by_rank(atac_windows))
## Time difference of 38.59569 secs
rows = nrow(pos_windows$metadata)
paste("Extracted profiles after removing overlaps by ranks:", rows)
## [1] "Extracted profiles after removing overlaps by ranks: 87407"
krows <- round(rows/1000)

# Filter by minimal CAGE requirement (TSS with atleast 2)
pos_windows$profiles <- pos_windows$profiles[pos_windows$metadata$max_score >= 2,]
pos_windows$metadata <- pos_windows$metadata[pos_windows$metadata$max_score >= 2,] %>% select(-total_score, -max_score)

rows = nrow(pos_windows$metadata)
paste("Extracted profiles after CAGE filtering:", rows)
## [1] "Extracted profiles after CAGE filtering: 60522"
krows <- round(rows/1000)

pos_windows$metadata

Exploration of the obtained profiles

## Explore the resulting profiles

# Plot chr distribution
plot_chr_distribution <- function(window_metadata, 
                                  title = "Chr distribution",
                                  save = FALSE,
                                  filename = "chr_distribution.png"){
  window_metadata %>% group_by(chr) %>% count() %>% 
  ggplot(aes(x = factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep="")), 
             y = n)) +
  geom_bar(stat = "identity", col = "black", fill = brewer.pal(8,"Dark2")[1]) +
  labs(title = title, x = "Chromosome", y = "N. windows") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust=1,size = 12)) -> plot
  if (save) {ggsave(paste("../Plots/input_exploration/", 
                          filename, 
                          sep = ""), 
                    plot, height = 5, dpi = 300) }
  return(plot)
}

plot_chr_distribution(pos_windows$metadata, 
                      paste("Chr distribution (Positive windows ", krows, "k)", sep=""),
                      save=TRUE, filename="positive_set/chr_distribution_posFiltered.png")
## Saving 7 x 5 in image

# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
get_cage_distribution_by_peak_position <- function(peaks_profile_df){
  apply(peaks_profile_df, 2, sum) %>% as_tibble() %>% 
  mutate(pos = c(-250:250, -250:250), strand = c(rep("+", 501), rep("-", 501))) %>% 
  rename(score = value) %>% relocate(score, .after = strand) %>%
  mutate(score = ifelse(strand == "-", -score, score))
}
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(pos_windows$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw() -> plot_cage_by_peak_pos
plot_cage_by_peak_pos

ggsave("../Plots/input_exploration/positive_set/cage_by_peak_pos_posFiltered.png", 
       plot_cage_by_peak_pos, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Plot distribution of ATAC-Seq peaks CAGE total coverage 
get_windows_total_cage_score_distribution <- function(peaks_profile_df){
  peaks_profile_df %>% 
    count(apply(peaks_profile_df, 1, sum)) %>% 
    rename(total_score = "apply(peaks_profile_df, 1, sum)", n_atac_peaks = n) %>%
    relocate(total_score, .after = n_atac_peaks) 
}

windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(pos_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none") -> windows_total_cage_score_plot1
windows_total_cage_score_plot1

ggsave("../Plots/input_exploration/positive_set/windows_total_cage_score1_posFiltered.png", 
       windows_total_cage_score_plot1, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 250000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank()) -> windows_total_cage_score_plot2
windows_total_cage_score_plot2

ggsave("../Plots/input_exploration/positive_set/windows_total_cage_score2_posFiltered.png", 
       windows_total_cage_score_plot2, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Plot the maximum tss score of each window
max_tss_score_distribution <- rownames_to_column(pos_windows$profiles, var = "atac_start") %>% 
  mutate(max_tss_score = apply(pos_windows$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  facet_zoom(ylim=c(0, 500), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none") -> max_tss_score_plot
max_tss_score_plot

ggsave("../Plots/input_exploration/positive_set/max_tss_score_plot_posFiltered.png", 
       max_tss_score_plot, 
       height = 4.5, dpi = 300) 
## Saving 7 x 4.5 in image

Exploration of different chromosomes score distribution

# Get the distribution of CAGE score over one chr windows positions 
get_score_distribution_by_pos_one_chr <- function(list_windows, chr){
  windows_profile <- list_windows$profiles[list_windows$metadata == chr,]
  score_distribution <- get_cage_distribution_by_peak_position(windows_profile)
  score_distribution$chr = chr
  return(score_distribution)  
}

# Get the distribution of CAGE score for all chr
get_score_distribution_by_pos_all_chr <- function(list_windows){
  list_df <- lapply(unique(list_windows$metadata$chr), function(x){
    get_score_distribution_by_pos_one_chr(list_windows, as.character(x))}) 
  return(data.frame(do.call(rbind, list_df)))
}

windows_score_distribution_by_pos_all_chr <-   
  get_score_distribution_by_pos_all_chr(pos_windows)

windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Positive set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  coord_cartesian(ylim = c(-1000, 1000)) +
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw() -> windows_score_by_pos_by_chr
windows_score_by_pos_by_chr

ggsave("../Plots/input_exploration/positive_set/windows_score_by_pos_by_chr_posFiltered.png", 
     windows_score_by_pos_by_chr, 
     height = 7, width = 10, dpi = 300) 

# Free axis
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Positive set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),    
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep="")),
             scales="free") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw() -> windows_score_by_pos_by_chr
windows_score_by_pos_by_chr

ggsave("../Plots/input_exploration/positive_set/windows_score_by_pos_by_chr_posFiltered_freeAxis.png", 
     windows_score_by_pos_by_chr, 
     height = 7, width = 12, dpi = 300) 

Exploration of different chromosomes profiles

# Plot some profiles
plot_set_profiles <- function(windows,
                             chr,
                             details = FALSE,
                             title = "",
                             ylim = c(-50, 50), scales = "fixed",
                             range = 1:56, sort = FALSE,
                             save = FALSE, filename = "set_profiles.png"){
  windows_profile <- windows$profiles[windows$metadata$chr == chr,]
  windows_profile %>% 
    mutate(total_coverage = apply(windows_profile, 1, sum),
           max_coverage = apply(windows_profile, 1, max)) -> windows_profile
  # Add details (minimum TSS score and total score of the window)
  if (details){
    row_name <- paste("w_", 1:nrow(windows_profile), "\n(T=", 
                      as.character(total_coverage), ", \nM=", 
                      max_coverage, ")") 
  } else {row_name <- paste("w_", 1:nrow(windows_profile))}
  windows_profile %>%
    mutate(window = row_name) %>%
    relocate(c(window, total_coverage), .before = Plus_1) -> temp
  if (sort){
    temp %>% arrange(desc(total_coverage)) %>% slice(range) -> temp
  } 
  # Prepare for plotting
  temp <- temp %>% select(-total_coverage, -max_coverage) %>% slice(range) %>% melt()
  temp$value[grepl("^M", temp$variable)] <- -temp$value[grepl("^M", temp$variable)] 
  temp$strand[grepl("^M", temp$variable)] <- "-"
  temp$strand[grepl("^P", temp$variable)] <- "+"
  temp$variable <- as.numeric(gsub("\\D", "", temp$variable)) - 251
  # Plot
  temp %>% ggplot(aes(x = variable, y = value, color = strand)) + geom_line() + 
    facet_wrap(~window, ncol = 8, strip.position = "left", scales = scales) +
    coord_cartesian(ylim = ylim) +
    labs(title = paste("Windows profiles (", 
                       deparse(substitute(range)), ", ", chr, ")", sep = ""), 
         y = "TSS score",
         x = "Relative position to ATAC-Seq mid peak",
         fill = NA) + 
    theme_bw() -> plot
  ggsave(paste("../Plots/input_exploration/", filename, sep=""), 
         plot, 
         height = 9, width = 20, dpi = 300) 
  return(plot)
}

# Free axis
plot_set_profiles(pos_windows, chr="chr1", ylim=c(NA,NA), scales="free", sort=FALSE, save=TRUE, 
                  filename="positive_set/profiles1_pos.png") 
## Using window as id variables

plot_set_profiles(pos_windows, chr="chr6", ylim=c(NA,NA), scales="free", sort=FALSE, save=TRUE, 
                  filename="positive_set/profiles2_pos.png") 
## Using window as id variables

plot_set_profiles(pos_windows, chr="chr11", ylim=c(NA,NA), scales="free", sort=FALSE, save=TRUE, 
                  filename="positive_set/profiles3_pos.png") 
## Using window as id variables

plot_set_profiles(pos_windows, chr="chr2", ylim=c(NA,NA), scales="free", sort=FALSE, save=TRUE, 
                  filename="positive_set/profiles4_pos.png") 
## Using window as id variables

plot_set_profiles(pos_windows, chr="chr3", ylim=c(NA,NA), scales="free", sort=FALSE, save=TRUE, 
                  filename="positive_set/profiles5_pos.png") 
## Using window as id variables

plot_set_profiles(pos_windows, chr="chr4", ylim=c(NA,NA), scales="free", sort=FALSE, save=TRUE, 
                  filename="positive_set/profiles6_pos.png") 
## Using window as id variables

Extract the negative set

Export data and generate negative ranges by bedtools shuffle

# Export original ATAC data (before filtering them by overlaps) 

# Duplicate 6 times the ATAC_peaks so to use as -i to generate more ranges with shuffle
ATAC_granges_duplicated <- GRanges(rbind(as.data.frame(ATAC_granges_peaks),
                                         as.data.frame(ATAC_granges_peaks),
                                         as.data.frame(ATAC_granges_peaks),
                                         as.data.frame(ATAC_granges_peaks),
                                         as.data.frame(ATAC_granges_peaks),
                                         as.data.frame(ATAC_granges_peaks),
                                         as.data.frame(ATAC_granges_peaks)))
export(ATAC_granges_duplicated, "../Data/atac_positive_all_granges_501_duplicatedX6.bed", format = "bed")
# Generate negative set ranges (from pseudo_dreg directory) in such a way that:
# - They will be the same size, num and strand proportion as atac positive granges, blacklist and atac positive granges will not be included, don't allow overlaps
bedtools shuffle -i ../atac_positive_all_granges_501_duplicatedX6.bed -g ../hg38.bed -excl hg38.blacklist_new -excl ../atac_positive_all_granges_501.bed -noOverlapping > negative_set_duplicateX6_noOverlaps_v1.bed

Extraction and exploration of negative range profiles (shuffling 1807k ranges without overlaps, new blacklist)

# Import negative set ranges and check for overlaps
negative_set_granges <- import("../Data/pseudo_dreg_sets/negative_set_duplicateX6_noOverlaps_v3.bed", format = "bed")

# Check for overlaps with ATAC positive regions and between granges in the same file
findOverlaps(negative_set_granges, ATAC_granges_peaks)
## Hits object with 0 hits and 0 metadata columns:
##    queryHits subjectHits
##    <integer>   <integer>
##   -------
##   queryLength: 1807834 / subjectLength: 258262
paste("Ranges before extraction:", length(negative_set_granges))
## [1] "Ranges before extraction: 1807834"
paste("Within overlaps:", sum(countOverlaps(negative_set_granges) > 1))
## [1] "Within overlaps: 0"
# Extract CAGE profiles
neg_windows <- report_time_execution(get_windows_profiles(CAGE_granges, 
                                                          negative_set_granges))
## Time difference of 26.60591 mins
# Check for profiles without overlapping CAGE data
non_overlapping_ranges <- apply(neg_windows$profiles, 1, sum) == 0
if (sum(non_overlapping_ranges) > 0){
  print(paste("Removing", sum(non_overlapping_ranges), "non-overlapping ranges"))
  neg_windows$profiles <- neg_windows$profiles[!non_overlapping_ranges,]
  neg_windows$metadata <- neg_windows$metadata[!non_overlapping_ranges,]
}

rows = nrow(neg_windows$metadata)
paste("Extracted profiles:", rows)
## [1] "Extracted profiles: 165580"
krows <- round(rows/1000)

Exploration of the obtained profiles (all negative set)

## Explore the resulting profiles

# Chr distribution
plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows ", krows, "k)", sep=""))

# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
 # facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()

# Plot distribution of ATAC-Seq peaks CAGE total coverage 
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 1000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank())

# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows$profiles %>% 
  mutate(atac_start = neg_windows$metadata$atac_start) %>%         
  mutate(max_tss_score = apply(neg_windows$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  #facet_zoom(ylim=c(0, 60), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none")

# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-   
  get_score_distribution_by_pos_all_chr(neg_windows)
  
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  coord_cartesian(ylim = c(-1000, 1000)) +
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw()

# Plot some profiles
plot_set_profiles(neg_windows, chr = "chr1", sort=TRUE) 
## Using window as id variables

plot_set_profiles(neg_windows, chr = "chr6", sort=TRUE) 
## Using window as id variables

plot_set_profiles(neg_windows, chr = "chr11", sort=TRUE) 
## Using window as id variables

plot_set_profiles(neg_windows, chr = "chr2", sort=TRUE) 
## Using window as id variables

plot_set_profiles(neg_windows, chr = "chr3", sort=TRUE) 
## Using window as id variables

plot_set_profiles(neg_windows, chr = "chr4", sort=TRUE) 
## Using window as id variables

Sample negative set and export both negative and positive set profiles and metadata

# Sample from negative set
windows_sampling <- function(windows, size){
  uniform_sampling <- runif(size, 1, nrow(windows$metadata))
  windows$metadata <- windows$metadata[uniform_sampling,]
  windows$profiles <- windows$profiles[uniform_sampling,]
  return(windows)
}

# Plot chr distribution
plot_chr_distribution(pos_windows$metadata, paste("Chr distribution (Positive windows, ",
                      round(nrow(pos_windows$metadata)/1000), "k)", sep = ""))

plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows, ",
                      round(nrow(neg_windows$metadata)/1000), "k)", sep = ""))

neg_windows_sampled <- windows_sampling(neg_windows, size=nrow(pos_windows$metadata))
plot_chr_distribution(neg_windows_sampled$metadata, paste("Chr distribution (Negative windows sampled, ",
                      round(nrow(neg_windows_sampled$metadata)/1000), "k)", sep = ""))

## Export data for ML feeding

# Merge positive and negative examples
windows_profile <- rbind(mutate(pos_windows$profiles, label = 1), 
                         mutate(neg_windows_sampled$profiles, label = 0))
windows_metadata <- rbind(mutate(pos_windows$metadata, label = 1), 
                          mutate(neg_windows_sampled$metadata, label = 0))

# Shuffle the data
index <- sample(nrow(windows_profile))
windows_profile <- windows_profile[index,]
windows_metadata <- windows_metadata[index,]

paste("Size windows profile", nrow(windows_profile))
## [1] "Size windows profile 121044"
paste("Size positive windows profile (filtered)", nrow(pos_windows$profile))
## [1] "Size positive windows profile (filtered) 60522"
paste("Size negative windows profile (sampled)", nrow(neg_windows_sampled$profile))
## [1] "Size negative windows profile (sampled) 60522"
# Divide train and test by chr
test_index <- windows_metadata$chr %in% c("chr2", "chr3", "chr4")
windows_profile_test <- windows_profile[test_index,]
windows_metadata_test <- windows_metadata[test_index,]
windows_profile_train <- windows_profile[!test_index,]
windows_metadata_train <- windows_metadata[!test_index,]

# Export
write_csv(windows_profile, "../Data/ML_input/profiles_rank_negNoFiltered_v2.csv")
write_csv(windows_metadata, "../Data/ML_input/metadata_rank_negNoFiltered_v2.csv")
write_csv(windows_profile_test, "../Data/ML_input/profiles_rank_negNoFiltered_v2_test.csv")
write_csv(windows_metadata_test, "../Data/ML_input/metadata_rank_negNoFiltered_v2_test.csv")
write_csv(windows_profile_train, "../Data/ML_input/profiles_rank_negNoFiltered_v2_train.csv")
write_csv(windows_metadata_train, "../Data/ML_input/metadata_rank_negNoFiltered_v2_train.csv")

Exploration of the obtained profiles (sampled negative set)

# Check
rows = nrow(neg_windows_sampled$metadata)
paste("Extracted profiles:", rows)
## [1] "Extracted profiles: 60522"
krows <- round(rows/1000)


## Explore the resulting profiles

# Chr distribution
plot_chr_distribution(neg_windows_sampled$metadata, paste("Chr distribution (Negative windows ", krows, "k)", sep=""), 
                      save=TRUE, filename="negative_set_NoFiltered/chr_distribution_negSampled.png")
## Saving 7 x 5 in image

# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows_sampled$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
 # facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw() -> plot_cage_by_peak_pos
plot_cage_by_peak_pos

ggsave("../Plots/input_exploration/negative_set_NoFiltered/cage_by_peak_pos_negSampled.png", 
       plot_cage_by_peak_pos, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Plot distribution of ATAC-Seq peaks CAGE total coverage 
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows_sampled$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none") -> windows_total_cage_score_plot1
windows_total_cage_score_plot1

ggsave("../Plots/input_exploration/negative_set_NoFiltered/windows_total_cage_score1_negSampled.png", 
       windows_total_cage_score_plot1, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 1000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank()) -> windows_total_cage_score_plot2
windows_total_cage_score_plot2

ggsave("../Plots/input_exploration/negative_set_NoFiltered/windows_total_cage_score2_negSampled.png", 
       windows_total_cage_score_plot2, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows_sampled$profiles %>% 
  mutate(atac_start = neg_windows_sampled$metadata$atac_start) %>%         
  mutate(max_tss_score = apply(neg_windows_sampled$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  facet_zoom(ylim=c(0, 60), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none") -> max_tss_score_plot
max_tss_score_plot

ggsave("../Plots/input_exploration/negative_set_NoFiltered/max_tss_score_plot_negSampled.png", 
       max_tss_score_plot, 
       height = 4.5, dpi = 300) 
## Saving 7 x 4.5 in image
# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-   
  get_score_distribution_by_pos_all_chr(neg_windows_sampled)
  
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  coord_cartesian(ylim = c(-1000, 1000)) +
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw() -> windows_score_by_pos_by_chr
windows_score_by_pos_by_chr

ggsave("../Plots/input_exploration/negative_set_NoFiltered/windows_score_by_pos_by_chr_negSampled.png", 
     windows_score_by_pos_by_chr, 
     height = 7, width = 10, dpi = 300) 

# Free axis
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep="")),
             scales="free") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw() -> windows_score_by_pos_by_chr
windows_score_by_pos_by_chr

ggsave("../Plots/input_exploration/negative_set_NoFiltered/windows_score_by_pos_by_chr_negSampled_freeAxis.png", 
     windows_score_by_pos_by_chr, 
     height = 7, width = 10, dpi = 300) 
# Plot some profiles (free axis)
plot_set_profiles(neg_windows_sampled, chr="chr1", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_NoFiltered/profiles1_negSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_sampled, chr="chr6", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_NoFiltered/profiles2_negSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_sampled, chr="chr11", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_NoFiltered/profiles3_negSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_sampled, chr="chr2", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_NoFiltered/profiles4_negSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_sampled, chr="chr3", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_NoFiltered/profiles5_negSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_sampled, chr="chr4", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_NoFiltered/profiles6_negSampled.png") 
## Using window as id variables

Exploration of the negative set sampled from filtered negative profiles (shuffling 1807k ranges without overlaps, filtering by CAGE requirement (atleast 2 IN TOTAL), sampling)

To check for filtering or not filtering I could create different datasets unfiltered and different kind of filters, then I could fit a model on each dataset which will be tested on all others. I could choose the one that performs better on average.

Filtering and sampling

# Filter the negative profiles by CAGE signal 
windows_profiles_filter <- function(windows,
                                    threshold = 1, fun = max){
  filter <- apply(windows$profiles, 1, fun) > threshold
    windows$profiles <-  windows$profile[filter,]
    windows$metadata <- windows$metadata[filter,]
  return(windows)
}

neg_windows_filtered_atleast_2_sum <- windows_profiles_filter(neg_windows, fun = sum)
neg_windows_filtered <- windows_profiles_filter(neg_windows)
neg_windows_filtered_atleast_3_max <- windows_profiles_filter(neg_windows, threshold = 2)

print(paste("Original windows number:", nrow(neg_windows$profiles)))
## [1] "Original windows number: 165580"
print(paste("N. windows after filtering (at least 2 reads in total):", nrow(neg_windows_filtered_atleast_2_sum$profiles)))
## [1] "N. windows after filtering (at least 2 reads in total): 79879"
print(paste("N. windows after filtering (at least a TSS with 2 reads):", nrow(neg_windows_filtered$profiles)))
## [1] "N. windows after filtering (at least a TSS with 2 reads): 63896"
print(paste("N. windows after filtering (at least a TSS with 3 reads):", nrow(neg_windows_filtered_atleast_3_max$profiles)))
## [1] "N. windows after filtering (at least a TSS with 3 reads): 26692"
# Sampling
neg_windows_filtered_sampled <- windows_sampling(neg_windows_filtered_atleast_2_sum, size=nrow(pos_windows$metadata))

Filtered negative set exploration

# Check
rows = nrow(neg_windows_filtered_sampled$metadata)
paste("Extracted profiles:", rows)
## [1] "Extracted profiles: 60522"
krows <- round(rows/1000)


## Explore the resulting profiles

# Chr distribution
plot_chr_distribution(neg_windows_filtered_sampled$metadata, paste("Chr distribution (Negative windows ", krows, "k)", sep=""), 
                      save=TRUE, filename="negative_set_Filtered2total/chr_distribution_negFiltered2totalSampled.png")
## Saving 7 x 5 in image

# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows_filtered_sampled$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
 # facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw() -> plot_cage_by_peak_pos
plot_cage_by_peak_pos

ggsave("../Plots/input_exploration/negative_set_Filtered2total/cage_by_peak_pos_negFiltered2totalSampled.png", 
       plot_cage_by_peak_pos, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Plot distribution of ATAC-Seq peaks CAGE total coverage 
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows_filtered_sampled$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none") -> windows_total_cage_score_plot1
windows_total_cage_score_plot1

ggsave("../Plots/input_exploration/negative_set_Filtered2total/windows_total_cage_score1_negFiltered2totalSampled.png", 
       windows_total_cage_score_plot1, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 1000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank()) -> windows_total_cage_score_plot2
windows_total_cage_score_plot2

ggsave("../Plots/input_exploration/negative_set_Filtered2total/windows_total_cage_score2_negFiltered2totalSampled.png", 
       windows_total_cage_score_plot2, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows_filtered_sampled$profiles %>% 
  mutate(atac_start = neg_windows_filtered_sampled$metadata$atac_start) %>%         
  mutate(max_tss_score = apply(neg_windows_filtered_sampled$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  facet_zoom(ylim=c(0, 60), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none") -> max_tss_score_plot
max_tss_score_plot

ggsave("../Plots/input_exploration/negative_set_Filtered2total/max_tss_score_plot_negFiltered2totalSampled.png", 
       max_tss_score_plot, 
       height = 4.5, dpi = 300) 
## Saving 7 x 4.5 in image
# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-   
  get_score_distribution_by_pos_all_chr(neg_windows_filtered_sampled)
  
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  coord_cartesian(ylim = c(-1000, 1000)) +
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw() -> windows_score_by_pos_by_chr
windows_score_by_pos_by_chr

ggsave("../Plots/input_exploration/negative_set_Filtered2total/windows_score_by_pos_by_chr_negFiltered2totalSampled.png", 
     windows_score_by_pos_by_chr, 
     height = 7, width = 10, dpi = 300) 

# Free axis
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep="")),
             scales="free") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw() -> windows_score_by_pos_by_chr
windows_score_by_pos_by_chr

ggsave("../Plots/input_exploration/negative_set_Filtered2total/windows_score_by_pos_by_chr_negFiltered2totalSampled_freeAxis.png", 
     windows_score_by_pos_by_chr, 
     height = 7, width = 10, dpi = 300) 
# Plot some profiles (free axis)
plot_set_profiles(neg_windows_filtered_sampled, chr="chr1", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2total/profiles1_negFiltered2totalSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr6", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2total/profiles2_negFiltered2totalSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr11", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2total/profiles3_negFiltered2totalSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr2", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2total/profiles4_negFiltered2totalSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr3", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2total/profiles5_negFiltered2totalSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr4", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2total/profiles6_negFiltered2totalSampled.png") 
## Using window as id variables

## Export data for ML feeding

# Merge positive and negative examples
windows_profile <- rbind(mutate(pos_windows$profiles, label = 1), 
                         mutate(neg_windows_filtered_sampled$profiles, label = 0))
windows_metadata <- rbind(mutate(pos_windows$metadata, label = 1), 
                          mutate(neg_windows_filtered_sampled$metadata, label = 0))

# Shuffle the data
index <- sample(nrow(windows_profile))
windows_profile <- windows_profile[index,]
windows_metadata <- windows_metadata[index,]

paste("Size windows profile", nrow(windows_profile))
## [1] "Size windows profile 121044"
paste("Size positive windows profile (filtered)", nrow(pos_windows$profile))
## [1] "Size positive windows profile (filtered) 60522"
paste("Size negative windows profile (filtered and sampled)", nrow(neg_windows_filtered_sampled$profile))
## [1] "Size negative windows profile (filtered and sampled) 60522"
# Divide train and test by chr
test_index <- windows_metadata$chr %in% c("chr2", "chr3", "chr4")
windows_profile_test <- windows_profile[test_index,]
windows_metadata_test <- windows_metadata[test_index,]
windows_profile_train <- windows_profile[!test_index,]
windows_metadata_train <- windows_metadata[!test_index,]

# Export
write_csv(windows_profile, "../Data/ML_input/profiles_rank_negFiltered_atleast2total_v2.csv")
write_csv(windows_metadata, "../Data/ML_input/metadata_rank_negFiltered_atleast2total_v2.csv")
write_csv(windows_profile_test, "../Data/ML_input/profiles_rank_negFiltered_atleast2total_v2_test.csv")
write_csv(windows_metadata_test, "../Data/ML_input/metadata_rank_negFiltered_atleast2total_v2_test.csv")
write_csv(windows_profile_train, "../Data/ML_input/profiles_rank_negFiltered_atleast2total_v2_train.csv")
write_csv(windows_metadata_train, "../Data/ML_input/metadata_rank_negFiltered_atleast2total_v2_train.csv")

Exploration of the negative set sampled from filtered negative profiles (shuffling 1807k ranges without overlaps, filtering by CAGE requirement (atleast a TSS of 2), sampling)

# Sampling from filtered negative set by minimum TSS score
neg_windows_filtered_sampled <- windows_sampling(neg_windows_filtered, size=nrow(pos_windows$metadata))

Filtered negative set exploration

# Check
rows = nrow(neg_windows_filtered_sampled$metadata)
paste("Extracted profiles:", rows)
## [1] "Extracted profiles: 60522"
krows <- round(rows/1000)


## Explore the resulting profiles

# Chr distribution
plot_chr_distribution(neg_windows_filtered_sampled$metadata, paste("Chr distribution (Negative windows ", krows, "k)", sep=""), 
                      save=TRUE, filename="negative_set_Filtered2max/chr_distribution_negFiltered2maxSampled.png")
## Saving 7 x 5 in image

# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows_filtered_sampled$profiles)

cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
 # facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
  labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",  
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
   scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw() -> plot_cage_by_peak_pos
plot_cage_by_peak_pos

ggsave("../Plots/input_exploration/negative_set_Filtered2max/cage_by_peak_pos_negFiltered2maxSampled.png", 
       plot_cage_by_peak_pos, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Plot distribution of ATAC-Seq peaks CAGE total coverage 
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows_filtered_sampled$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>% 
  ggplot(aes(x = total_score, y = n_atac_peaks)) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) + 
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  labs(title = "Windows profiles total score distribution", 
       y = "N. Windows profiles",
       x = "Total CAGE score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none") -> windows_total_cage_score_plot1
windows_total_cage_score_plot1

ggsave("../Plots/input_exploration/negative_set_Filtered2max/windows_total_cage_score1_negFiltered2maxSampled.png", 
       windows_total_cage_score_plot1, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>% 
  ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
  geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
  labs(title = "Windows profiles total score distribution", 
     x = NA,
     y = "Total CAGE score",
     size = "N. Windows profiles",
     col = "") +
  facet_zoom(ylim=c(0, 1000), shrink = FALSE) +
  scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  theme(axis.title.x=element_blank()) -> windows_total_cage_score_plot2
windows_total_cage_score_plot2

ggsave("../Plots/input_exploration/negative_set_Filtered2max/windows_total_cage_score2_negFiltered2maxSampled.png", 
       windows_total_cage_score_plot2, 
       height = 5, dpi = 300) 
## Saving 7 x 5 in image
# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows_filtered_sampled$profiles %>% 
  mutate(atac_start = neg_windows_filtered_sampled$metadata$atac_start) %>%         
  mutate(max_tss_score = apply(neg_windows_filtered_sampled$profiles, 1, max)) %>% 
  select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()

max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) + 
  geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) + 
  facet_zoom(ylim=c(0, 60), shrink = FALSE) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
  coord_cartesian(xlim = c(0, 50)) +
  labs(title = "Max TSS score distribution", 
       y = "N. windows profiles",
       x = "Max TSS score",
       fill = NA) +
  theme_bw() +
  theme(legend.position = "none") -> max_tss_score_plot
max_tss_score_plot

ggsave("../Plots/input_exploration/negative_set_Filtered2max/max_tss_score_plot_negFiltered2maxSampled.png", 
       max_tss_score_plot, 
       height = 4.5, dpi = 300) 
## Saving 7 x 4.5 in image
# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-   
  get_score_distribution_by_pos_all_chr(neg_windows_filtered_sampled)
  
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  coord_cartesian(ylim = c(-1000, 1000)) +
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw() -> windows_score_by_pos_by_chr
windows_score_by_pos_by_chr

ggsave("../Plots/input_exploration/negative_set_Filtered2max/windows_score_by_pos_by_chr_negFiltered2maxSampled.png", 
     windows_score_by_pos_by_chr, 
     height = 7, width = 10, dpi = 300) 

# Free axis
windows_score_distribution_by_pos_all_chr %>% 
  ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
  labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
       x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") + 
  facet_wrap(~factor(chr,
                     levels = paste("chr", c(1:22, "X", "Y"), sep="")),
             scales="free") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw() -> windows_score_by_pos_by_chr
windows_score_by_pos_by_chr

ggsave("../Plots/input_exploration/negative_set_Filtered2max/windows_score_by_pos_by_chr_negFiltered2maxSampled_freeAxis.png", 
     windows_score_by_pos_by_chr, 
     height = 7, width = 10, dpi = 300) 
# Plot some profiles (free axis)
plot_set_profiles(neg_windows_filtered_sampled, chr="chr1", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2max/profiles1_negFiltered2maxSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr6", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2max/profiles2_negFiltered2maxSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr11", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2max/profiles3_negFiltered2maxSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr2", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2max/profiles4_negFiltered2maxSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr3", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2max/profiles5_negFiltered2maxSampled.png") 
## Using window as id variables

plot_set_profiles(neg_windows_filtered_sampled, chr="chr4", ylim=c(NA,NA), scales="free", sort=TRUE, save=TRUE, 
                  filename="negative_set_Filtered2max/profiles6_negFiltered2maxSampled.png") 
## Using window as id variables

## Export data for ML feeding

# Merge positive and negative examples
windows_profile <- rbind(mutate(pos_windows$profiles, label = 1), 
                         mutate(neg_windows_filtered_sampled$profiles, label = 0))
windows_metadata <- rbind(mutate(pos_windows$metadata, label = 1), 
                          mutate(neg_windows_filtered_sampled$metadata, label = 0))

# Shuffle the data
index <- sample(nrow(windows_profile))
windows_profile <- windows_profile[index,]
windows_metadata <- windows_metadata[index,]

paste("Size windows profile", nrow(windows_profile))
## [1] "Size windows profile 121044"
paste("Size positive windows profile (filtered)", nrow(pos_windows$profile))
## [1] "Size positive windows profile (filtered) 60522"
paste("Size negative windows profile (filtered and sampled)", nrow(neg_windows_filtered_sampled$profile))
## [1] "Size negative windows profile (filtered and sampled) 60522"
# Divide train and test by chr
test_index <- windows_metadata$chr %in% c("chr2", "chr3", "chr4")
windows_profile_test <- windows_profile[test_index,]
windows_metadata_test <- windows_metadata[test_index,]
windows_profile_train <- windows_profile[!test_index,]
windows_metadata_train <- windows_metadata[!test_index,]

# Export
write_csv(windows_profile, "../Data/ML_input/profiles_rank_negFiltered_atleast2max_v2.csv")
write_csv(windows_metadata, "../Data/ML_input/metadata_rank_negFiltered_atleast2max_v2.csv")
write_csv(windows_profile_test, "../Data/ML_input/profiles_rank_negFiltered_atleast2max_v2_test.csv")
write_csv(windows_metadata_test, "../Data/ML_input/metadata_rank_negFiltered_atleast2max_v2_test.csv")
write_csv(windows_profile_train, "../Data/ML_input/profiles_rank_negFiltered_atleast2max_v2_train.csv")
write_csv(windows_metadata_train, "../Data/ML_input/metadata_rank_negFiltered_atleast2max_v2_train.csv")